{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0184fce5-787c-4f30-9369-fae87e2ade7e",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING: An illegal reflective access operation has occurred\n",
      "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/opt/spark/jars/spark-unsafe_2.12-3.1.2.jar) to constructor java.nio.DirectByteBuffer(long,int)\n",
      "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n",
      "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n",
      "WARNING: All illegal access operations will be denied in a future release\n",
      "22/05/18 02:23:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
      "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
      "Setting default log level to \"WARN\".\n",
      "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Debug -- spark init\n",
      "Debug -- version: 3.1.2\n",
      "Debug -- applicaitonId: spark-application-1652840612130\n",
      "Debug -- uiWebUrl: http://jupyter.my.nginx.test/hub/user-redirect/proxy/4040/jobs/\n"
     ]
    }
   ],
   "source": [
    "import metaspore as ms\n",
    "\n",
    "# Extra Spark properties handed to get_session() through its spark_confs argument.\n",
    "spark_confs = {\n",
    "    # Unit spelled out; Spark reads a bare number for this property as seconds.\n",
    "    \"spark.network.timeout\": \"500s\",\n",
    "    \"spark.ui.showConsoleProgress\": \"true\",\n",
    "    \"spark.kubernetes.executor.deleteOnTermination\": \"true\",\n",
    "}\n",
    "\n",
    "# Request a non-local session; the sizing arguments are passed through as given.\n",
    "spark = ms.spark.get_session(local=False,\n",
    "                            app_name='ESMM read data',\n",
    "                            batch_size=256,\n",
    "                            worker_count=4,\n",
    "                            server_count=4,\n",
    "                            worker_memory='5G',\n",
    "                            server_memory='5G',\n",
    "                            coordinator_memory='5G',\n",
    "                            spark_confs=spark_confs)\n",
    "sc = spark.sparkContext\n",
    "\n",
    "# Echo the session identity so this run can be matched to the Spark UI.\n",
    "print('Debug -- spark init')\n",
    "print('Debug -- version:', sc.version)\n",
    "print('Debug -- applicationId:', sc.applicationId)\n",
    "print('Debug -- uiWebUrl:', sc.uiWebUrl)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "284df78b-0ead-4f2b-8743-5036af6e77a9",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Python does not expand '${MY_S3_BUCKET}' inside a string literal, so resolve it\n",
    "# from the environment here. expandvars() leaves the text untouched when the\n",
    "# variable is unset or when the placeholder was already replaced by hand.\n",
    "s3_bucket = os.path.expandvars('${MY_S3_BUCKET}')\n",
    "\n",
    "# Raw CSV inputs.\n",
    "train_path = f'{s3_bucket}/aliccp/traindata_4000w.csv'\n",
    "test_path = f'{s3_bucket}/aliccp/testdata_4000w.csv'\n",
    "\n",
    "# Parquet outputs written by the later cells.\n",
    "train_output_path = f'{s3_bucket}/aliccp/traindata_4000w.parquet'\n",
    "test_output_path = f'{s3_bucket}/aliccp/testdata_4000w.parquet'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "dedd9aa5-78fe-4d2a-ace7-4aa7d88e6478",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql.types import StructType, StructField, StringType\n",
    "\n",
    "# Field names in schema order: click/purchase first, then the numerically named columns.\n",
    "aliccp_columns = [\n",
    "    'click', 'purchase',\n",
    "    '101', '121', '122', '124', '125', '126', '127', '128', '129',\n",
    "    '205', '206', '207', '216',\n",
    "    '508', '509', '702', '853',\n",
    "    '301',\n",
    "]\n",
    "\n",
    "# Every column is declared as a nullable string.\n",
    "aliccp_schema = StructType([\n",
    "    StructField(column, StringType(), True) for column in aliccp_columns\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "b3dd596f-8473-446c-a2e1-2579330c00c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Both splits are read with the same CSV options and the explicit schema.\n",
    "csv_read_options = dict(sep=',', inferSchema=False, header=True, schema=aliccp_schema)\n",
    "\n",
    "train_dataset = spark.read.csv(train_path, **csv_read_options)\n",
    "test_dataset = spark.read.csv(test_path, **csv_read_options)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "c57f9538-121c-4da6-94be-df2d602f445d",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Stage 4:>                                                          (0 + 1) / 1]\r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----+--------+-----+---+---+---+---+---+---+---+---+------+----+------+------+----+------+-----+-----+---+\n",
      "|click|purchase|  101|121|122|124|125|126|127|128|129|   205| 206|   207|   216| 508|   509|  702|  853|301|\n",
      "+-----+--------+-----+---+---+---+---+---+---+---+---+------+----+------+------+----+------+-----+-----+---+\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|6617|241253|     0|3889|     0|    0|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|285875|4932|227985|104091|   0|     0|    0|    0|  1|\n",
      "|    1|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|106773|3097| 97188| 64469|5035|     0|14074|  752|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|5971|147495| 87668| 739|     0|    0|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2| 45163|3097|220349| 25422|5035|     0|    0|  752|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|6444| 59432| 55177|2915| 34046| 6958|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0| 811| 23736| 82550|   0| 36337|    0|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0| 230|138597| 55177|5787|     0| 6958|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|6444|     0| 64469|2915|     0|14074|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|399833| 692|141046|     0|2812|     0|    0|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2| 63132|2106|118244|     0|2549|     0|    0|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|2106|257970| 25815|2549|     0|35383|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|438752|6062|186988| 20117|4075|     0|    0|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|158157|5694|248021|     0|1330|     0|    0|    0|  1|\n",
      "|    1|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|299367| 230| 68610|     0|5787|     0|    0|17908|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|106081|3097| 29731| 15388|5035|     0|    0|  752|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|2106| 44342| 92327|2549|103992|33390|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|403187|3097|215214|  8928|5035| 32685|18876|  752|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|341091|2702|223080|     0|   0|     0|    0|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|104296|3097|202576| 81461|5035|     0|    0|32337|  1|\n",
      "+-----+--------+-----+---+---+---+---+---+---+---+---+------+----+------+------+----+------+-----+-----+---+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                                                \r"
     ]
    }
   ],
   "source": [
    "# Preview the first 20 rows of the train split as read from CSV.\n",
    "train_dataset.show(n=20, truncate=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "2a3663b5-eb03-449a-bad2-aa16accd268c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----+--------+-----+---+---+---+---+---+---+---+---+------+----+------+-----+----+-----+-----+-----+---+\n",
      "|click|purchase|  101|121|122|124|125|126|127|128|129|   205| 206|   207|  216| 508|  509|  702|  853|301|\n",
      "+-----+--------+-----+---+---+---+---+---+---+---+---+------+----+------+-----+----+-----+-----+-----+---+\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|2691|121419|92603|2258|69003|46330|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|1913|     0|79854|4074|    0| 1873|18559|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2| 60276|2702|140732|15213|   0|    0|    0|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0| 230|121419|    0|5787|69003|    0|25901|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|189046|4468| 51249|    0|1480|    0|    0|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2| 53047|6272| 36908|90637|   0|    0|    0|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|428257|2702|153821|89671|   0|    0|    0|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|  7942| 424|227867|18202|1175|    0|    0|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|141648| 687| 88353|32653|3373|    0|    0|26926|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|126767|3849|258252|49326|3199|    0|38156|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|165871|5290| 68303|40176|   0|    0|    0|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|5971|121419|    0| 739|69003|    0|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|187095|6213|219664|    0|1599|    0|    0|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|2691|121419|14599|2258|69003|13654|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|4965| 57196|34044|   0|97012|    0|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|154866|2691|  3419|53608|2258|    0|28848|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|3909| 57196|34044|   0|97012|    0|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|2903| 46605|97750|   0|    0|    0|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2| 69349|2228| 92097|69790|   0|    0|    0|    0|  1|\n",
      "|    0|       0|24506| 50|  7|  2|  7|  0|  2|  1|  2|301759|1469|229513| 4865|   0|    0|    0|    0|  1|\n",
      "+-----+--------+-----+---+---+---+---+---+---+---+---+------+----+------+-----+----+-----+-----+-----+---+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Preview the first 20 rows of the test split as read from CSV.\n",
    "test_dataset.show(n=20, truncate=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "85fd883e-8df1-4e83-bd04-48955e387210",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Stage 11:>                                                         (0 + 1) / 1]\r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----+---------+---------+-----+---+---+---+---+---+---+---+---+------+----+------+------+----+------+-----+-----+---+\n",
      "|label|ctr_label|cvr_label|  101|121|122|124|125|126|127|128|129|   205| 206|   207|   216| 508|   509|  702|  853|301|\n",
      "+-----+---------+---------+-----+---+---+---+---+---+---+---+---+------+----+------+------+----+------+-----+-----+---+\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|6617|241253|     0|3889|     0|    0|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|285875|4932|227985|104091|   0|     0|    0|    0|  1|\n",
      "|    0|        1|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|106773|3097| 97188| 64469|5035|     0|14074|  752|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|5971|147495| 87668| 739|     0|    0|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2| 45163|3097|220349| 25422|5035|     0|    0|  752|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|6444| 59432| 55177|2915| 34046| 6958|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0| 811| 23736| 82550|   0| 36337|    0|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0| 230|138597| 55177|5787|     0| 6958|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|6444|     0| 64469|2915|     0|14074|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|399833| 692|141046|     0|2812|     0|    0|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2| 63132|2106|118244|     0|2549|     0|    0|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|2106|257970| 25815|2549|     0|35383|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|438752|6062|186988| 20117|4075|     0|    0|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|158157|5694|248021|     0|1330|     0|    0|    0|  1|\n",
      "|    0|        1|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|299367| 230| 68610|     0|5787|     0|    0|17908|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|106081|3097| 29731| 15388|5035|     0|    0|  752|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|2106| 44342| 92327|2549|103992|33390|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|403187|3097|215214|  8928|5035| 32685|18876|  752|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|341091|2702|223080|     0|   0|     0|    0|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|104296|3097|202576| 81461|5035|     0|    0|32337|  1|\n",
      "+-----+---------+---------+-----+---+---+---+---+---+---+---+---+------+----+------+------+----+------+-----+-----+---+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                                                \r"
     ]
    }
   ],
   "source": [
    "from pyspark.sql import functions as F\n",
    "\n",
    "# Columns carried over unchanged after the three label columns.\n",
    "train_feature_columns = ['101', '121', '122', '124', '125', '126', '127', '128', '129',\n",
    "                         '205', '206', '207', '216', '508', '509', '702', '853', '301']\n",
    "\n",
    "# True only for rows where both click and purchase equal the string '1'.\n",
    "train_clicked_and_purchased = (F.col('click') == '1') & (F.col('purchase') == '1')\n",
    "\n",
    "# Output layout: label, ctr_label (from click), cvr_label (from purchase), then the features.\n",
    "train_dataset_fg = train_dataset.select(\n",
    "    F.when(train_clicked_and_purchased, '1').otherwise('0').alias('label'),\n",
    "    F.col('click').alias('ctr_label'),\n",
    "    F.col('purchase').alias('cvr_label'),\n",
    "    *train_feature_columns\n",
    ")\n",
    "train_dataset_fg.show(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "2ade226b-7f70-4abc-b9e1-65279a2d4bb1",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Stage 12:>                                                         (0 + 1) / 1]\r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-----+---------+---------+-----+---+---+---+---+---+---+---+---+------+----+------+-----+----+-----+-----+-----+---+\n",
      "|label|ctr_label|cvr_label|  101|121|122|124|125|126|127|128|129|   205| 206|   207|  216| 508|  509|  702|  853|301|\n",
      "+-----+---------+---------+-----+---+---+---+---+---+---+---+---+------+----+------+-----+----+-----+-----+-----+---+\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|2691|121419|92603|2258|69003|46330|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|1913|     0|79854|4074|    0| 1873|18559|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2| 60276|2702|140732|15213|   0|    0|    0|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0| 230|121419|    0|5787|69003|    0|25901|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|189046|4468| 51249|    0|1480|    0|    0|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2| 53047|6272| 36908|90637|   0|    0|    0|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|428257|2702|153821|89671|   0|    0|    0|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|  7942| 424|227867|18202|1175|    0|    0|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|141648| 687| 88353|32653|3373|    0|    0|26926|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|126767|3849|258252|49326|3199|    0|38156|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|165871|5290| 68303|40176|   0|    0|    0|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|5971|121419|    0| 739|69003|    0|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|187095|6213|219664|    0|1599|    0|    0|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|2691|121419|14599|2258|69003|13654|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|4965| 57196|34044|   0|97012|    0|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|154866|2691|  3419|53608|2258|    0|28848|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|3909| 57196|34044|   0|97012|    0|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|     0|2903| 46605|97750|   0|    0|    0|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2| 69349|2228| 92097|69790|   0|    0|    0|    0|  1|\n",
      "|    0|        0|        0|24506| 50|  7|  2|  7|  0|  2|  1|  2|301759|1469|229513| 4865|   0|    0|    0|    0|  1|\n",
      "+-----+---------+---------+-----+---+---+---+---+---+---+---+---+------+----+------+-----+----+-----+-----+-----+---+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                                                \r"
     ]
    }
   ],
   "source": [
    "from pyspark.sql import functions as F\n",
    "\n",
    "# Columns carried over unchanged after the three label columns.\n",
    "test_feature_columns = ['101', '121', '122', '124', '125', '126', '127', '128', '129',\n",
    "                        '205', '206', '207', '216', '508', '509', '702', '853', '301']\n",
    "\n",
    "# True only for rows where both click and purchase equal the string '1'.\n",
    "test_clicked_and_purchased = (F.col('click') == '1') & (F.col('purchase') == '1')\n",
    "\n",
    "# Output layout: label, ctr_label (from click), cvr_label (from purchase), then the features.\n",
    "test_dataset_fg = test_dataset.select(\n",
    "    F.when(test_clicked_and_purchased, '1').otherwise('0').alias('label'),\n",
    "    F.col('click').alias('ctr_label'),\n",
    "    F.col('purchase').alias('cvr_label'),\n",
    "    *test_feature_columns\n",
    ")\n",
    "test_dataset_fg.show(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "f7741b62-3611-4c6d-921f-5244ed70f5e1",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                                                \r"
     ]
    }
   ],
   "source": [
    "# Persist the labelled train split as parquet, replacing any existing output.\n",
    "train_dataset_fg.write.mode(\"overwrite\").parquet(train_output_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "3c908aff-502c-48e5-902f-17375f53fb67",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                                                \r"
     ]
    }
   ],
   "source": [
    "# Persist the labelled test split as parquet, replacing any existing output.\n",
    "test_dataset_fg.write.mode(\"overwrite\").parquet(test_output_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "4ddd0ed7-67b6-4f70-9f2b-ba4ae859be1c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2022-05-18 04:07:17          0 _SUCCESS\n",
      "2022-05-18 04:06:45   40365015 part-00000-f7662809-5030-4a59-a7b4-dea6935c329b-c000.snappy.parquet\n",
      "2022-05-18 04:06:46   40281867 part-00001-f7662809-5030-4a59-a7b4-dea6935c329b-c000.snappy.parquet\n",
      "2022-05-18 04:06:44   40068487 part-00002-f7662809-5030-4a59-a7b4-dea6935c329b-c000.snappy.parquet\n",
      "2022-05-18 04:06:44   40675127 part-00003-f7662809-5030-4a59-a7b4-dea6935c329b-c000.snappy.parquet\n",
      "2022-05-18 04:06:45   40386559 part-00004-f7662809-5030-4a59-a7b4-dea6935c329b-c000.snappy.parquet\n",
      "2022-05-18 04:06:45   40444874 part-00005-f7662809-5030-4a59-a7b4-dea6935c329b-c000.snappy.parquet\n",
      "2022-05-18 04:06:46   40602994 part-00006-f7662809-5030-4a59-a7b4-dea6935c329b-c000.snappy.parquet\n",
      "2022-05-18 04:06:49   40516855 part-00007-f7662809-5030-4a59-a7b4-dea6935c329b-c000.snappy.parquet\n",
      "2022-05-18 04:07:00   40667213 part-00008-f7662809-5030-4a59-a7b4-dea6935c329b-c000.snappy.parquet\n",
      "2022-05-18 04:07:00   40648863 part-00009-f7662809-5030-4a59-a7b4-dea6935c329b-c000.snappy.parquet\n",
      "2022-05-18 04:07:00   40815173 part-00010-f7662809-5030-4a59-a7b4-dea6935c329b-c000.snappy.parquet\n",
      "2022-05-18 04:07:01   40522963 part-00011-f7662809-5030-4a59-a7b4-dea6935c329b-c000.snappy.parquet\n",
      "2022-05-18 04:07:00   40309613 part-00012-f7662809-5030-4a59-a7b4-dea6935c329b-c000.snappy.parquet\n",
      "2022-05-18 04:07:02   40201491 part-00013-f7662809-5030-4a59-a7b4-dea6935c329b-c000.snappy.parquet\n",
      "2022-05-18 04:07:04   40559245 part-00014-f7662809-5030-4a59-a7b4-dea6935c329b-c000.snappy.parquet\n",
      "2022-05-18 04:07:05   40617792 part-00015-f7662809-5030-4a59-a7b4-dea6935c329b-c000.snappy.parquet\n",
      "2022-05-18 04:07:16   40172518 part-00016-f7662809-5030-4a59-a7b4-dea6935c329b-c000.snappy.parquet\n",
      "2022-05-18 04:07:16   40321659 part-00017-f7662809-5030-4a59-a7b4-dea6935c329b-c000.snappy.parquet\n",
      "2022-05-18 04:07:08   13697817 part-00018-f7662809-5030-4a59-a7b4-dea6935c329b-c000.snappy.parquet\n"
     ]
    }
   ],
   "source": [
    "# List the objects under the train parquet output prefix with the AWS CLI.\n",
    "!aws s3 ls ${MY_S3_BUCKET}/aliccp/traindata_4000w.parquet/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "f06de3f8-f24d-4ae8-9e2c-105634813cb2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2022-05-18 04:08:10          0 _SUCCESS\n",
      "2022-05-18 04:07:33   40941588 part-00000-9c1c191f-bad0-42f3-afd5-b6f41fc724cd-c000.snappy.parquet\n",
      "2022-05-18 04:07:32   40434223 part-00001-9c1c191f-bad0-42f3-afd5-b6f41fc724cd-c000.snappy.parquet\n",
      "2022-05-18 04:07:33   40626005 part-00002-9c1c191f-bad0-42f3-afd5-b6f41fc724cd-c000.snappy.parquet\n",
      "2022-05-18 04:07:32   40628926 part-00003-9c1c191f-bad0-42f3-afd5-b6f41fc724cd-c000.snappy.parquet\n",
      "2022-05-18 04:07:33   40638544 part-00004-9c1c191f-bad0-42f3-afd5-b6f41fc724cd-c000.snappy.parquet\n",
      "2022-05-18 04:07:32   40906301 part-00005-9c1c191f-bad0-42f3-afd5-b6f41fc724cd-c000.snappy.parquet\n",
      "2022-05-18 04:07:32   40905847 part-00006-9c1c191f-bad0-42f3-afd5-b6f41fc724cd-c000.snappy.parquet\n",
      "2022-05-18 04:07:38   40720216 part-00007-9c1c191f-bad0-42f3-afd5-b6f41fc724cd-c000.snappy.parquet\n",
      "2022-05-18 04:07:46   40732156 part-00008-9c1c191f-bad0-42f3-afd5-b6f41fc724cd-c000.snappy.parquet\n",
      "2022-05-18 04:07:48   40798508 part-00009-9c1c191f-bad0-42f3-afd5-b6f41fc724cd-c000.snappy.parquet\n",
      "2022-05-18 04:07:48   40945677 part-00010-9c1c191f-bad0-42f3-afd5-b6f41fc724cd-c000.snappy.parquet\n",
      "2022-05-18 04:07:50   41149505 part-00011-9c1c191f-bad0-42f3-afd5-b6f41fc724cd-c000.snappy.parquet\n",
      "2022-05-18 04:07:51   40927151 part-00012-9c1c191f-bad0-42f3-afd5-b6f41fc724cd-c000.snappy.parquet\n",
      "2022-05-18 04:07:54   40551080 part-00013-9c1c191f-bad0-42f3-afd5-b6f41fc724cd-c000.snappy.parquet\n",
      "2022-05-18 04:07:52   40347730 part-00014-9c1c191f-bad0-42f3-afd5-b6f41fc724cd-c000.snappy.parquet\n",
      "2022-05-18 04:07:54   40913138 part-00015-9c1c191f-bad0-42f3-afd5-b6f41fc724cd-c000.snappy.parquet\n",
      "2022-05-18 04:08:01   40544125 part-00016-9c1c191f-bad0-42f3-afd5-b6f41fc724cd-c000.snappy.parquet\n",
      "2022-05-18 04:08:04   40486015 part-00017-9c1c191f-bad0-42f3-afd5-b6f41fc724cd-c000.snappy.parquet\n",
      "2022-05-18 04:08:05   40078809 part-00018-9c1c191f-bad0-42f3-afd5-b6f41fc724cd-c000.snappy.parquet\n",
      "2022-05-18 04:08:06   40382034 part-00019-9c1c191f-bad0-42f3-afd5-b6f41fc724cd-c000.snappy.parquet\n",
      "2022-05-18 04:07:56    4102949 part-00020-9c1c191f-bad0-42f3-afd5-b6f41fc724cd-c000.snappy.parquet\n"
     ]
    }
   ],
   "source": [
    "# List the objects under the test parquet output prefix with the AWS CLI.\n",
    "!aws s3 ls ${MY_S3_BUCKET}/aliccp/testdata_4000w.parquet/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "7098d50e-fcf6-477e-b967-9caeb7cfed3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stop the Spark session once both parquet outputs have been written and listed.\n",
    "spark.stop()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7fae8e6a-c728-4ec6-ab9c-445c4d5932d1",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
